1 Load Packages

library(tidyverse)
library(statnet)
library(cvTools)
library(furrr)
library(glasso)
library(mvtnorm)
# remotes::install_github('leifeld/xergm')
library(xergm)
library(ndtv)
library(recipes)
library(bestNormalize)
library(changepoint)
theme_set(theme_bw())

2 Import Data

data = read_csv('data.csv')

3 Data Preprocess

Generate Year and Quarter.

data = data |> 
  mutate(Year = year(Date),
         Year = as_factor(Year),
         Quarter = quarter(Date),
         Quarter = as_factor(Quarter)) |> 
  select(Year,Quarter,everything(),-Date)

Check number of observation for each combination of Year and Quarter.

data |> 
  count(Year,Quarter)
# A tibble: 41 × 3
   Year  Quarter     n
   <fct> <fct>   <int>
 1 2014  2          18
 2 2014  3          65
 3 2014  4          64
 4 2015  1          63
 5 2015  2          61
 6 2015  3          65
 7 2015  4          64
 8 2016  1          62
 9 2016  2          63
10 2016  3          65
# ℹ 31 more rows

Drop observations for the 2nd quarter of 2014 and the 2nd quarter of 2024.

data = data |> 
  filter(!((Year==2014 & Quarter==2) | (Year==2024 & Quarter==2)))

4 Find rho values

Scale and normalize each variables except for Year and Quarter.

Find optimal \(\rho\) according to the change point of metric (square) generated by cross validation for each combination of Year and Quarter.

# Function for normalizing variables for each cv-fold
scale_cv_data = function(k, folds, data){
  
  id_test = folds$subsets[folds$which==k]
  
  train_data = data |> slice(-id_test)
  test_data = data |> slice(id_test)
  
  set.seed(1)
  preprocessor = recipe(~., data = train_data) |> 
    step_best_normalize(all_numeric_predictors()) |> 
    step_normalize(all_numeric_predictors())
  
  result = list(scale_train = bake(prep(preprocessor),new_data = train_data),
                scale_test = bake(prep(preprocessor),new_data = test_data))
  
  return(result)
  
}
# Function for calculating cv-metric (square) given rho for each combination of Year and Quarter
lasso_tune = function(rho, train_data_list, test_data_list){
  
  K = length(train_data_list)
  metric = rep(NA, K)
  
  for(k in 1:K){
    
    s = cov(train_data_list[[k]])
    p = ncol(s)
    nobs = nrow(test_data_list[[k]])
    set.seed(1)
    graphical_lasso = glasso(s = s, 
                             rho = rho,
                             penalize.diagonal = FALSE)
    sigma = graphical_lasso$w
    
    log_density = dmvnorm(x = test_data_list[[k]],
                          mean = rep(0,p),
                          sigma = sigma,
                          log = TRUE)
    
    inv_cov = graphical_lasso$wi
    diag(inv_cov) = 0
    num_edge = sum(inv_cov!=0)/2
    
    metric[k] = mean(log_density, na.rm = TRUE) / ((num_edge + 1)^2)
    
  }
  
  avg_metric = mean(metric)
  
  return(avg_metric)
  
}
# Candidate rho values
rho_seq = seq(0.001,1,0.001)

# Combinations of Year and Quarter
combination = data |> 
  distinct(Year,Quarter) |> 
  mutate(rho = NA)

# Number of folds in cv
K = 10

# Find optimal rho according to the change point of metric (square) generated by cross validation for each combination of Year and Quarter
for(i in 1:nrow(combination)){
  
  # Data for specific Year and Quarter
  temp_data = data |> 
    filter(Year==combination$Year[i],
           Quarter==combination$Quarter[i]) |> 
    select(-Year,-Quarter)
  
  set.seed(1)
  # Create cv-folds for specific Year and Quarter data
  temp_folds = cvFolds(n = nrow(temp_data), K = K)
  
  # Normalize variables for each fold
  plan(multisession, workers = parallel::detectCores()-1)
  temp_scale_data_list = future_map(.x = 1:K,
                                    .f = scale_cv_data,
                                    folds = temp_folds,
                                    data = temp_data,
                                    .progress = TRUE)
  
  temp_scale_train_data = map(.x = temp_scale_data_list,
                              .f = ~.x$scale_train)
  
  temp_scale_test_data = map(.x = temp_scale_data_list,
                             .f = ~.x$scale_test)
  
  # Calculate cv-metric (square) for each rho value on the data corresponding to specific combination of Year and Quarter
  plan(multisession, workers = parallel::detectCores()-1)
  temp_metric = future_map_dbl(.x = rho_seq,
                               .f = lasso_tune,
                               train_data_list = temp_scale_train_data,
                               test_data_list = temp_scale_test_data,
                               .progress = TRUE)

  # Calculate the first-order and second-order derivatives of the cv-metric corresponding to each rho
  temp_derivatives = tibble(rho = rho_seq,
                            metric = temp_metric) |>
    mutate(slope1 = c(NA,diff(metric)) / c(NA,diff(rho)),
           slope2 = c(NA,diff(slope1)) / c(NA,diff(rho)))
  
  # Extract the second-order derivatives for rho fall in [0.05, 0.95]
  temp_slope2 = temp_derivatives |>
    filter(between(rho,0.05,0.95)) |>
    select(rho,slope2)

  # Find the optimal change point in variance
  set.seed(1)
  temp_opt_cpt = cpt.var(data = temp_slope2$slope2, # the second-order derivatives
                         penalty = 'MBIC', # penalty metric for calculate optimal positioning of change point
                         method = 'BinSeg',
                         test.stat = 'Normal', # assumed test distribution of the data
                         Q = 1) # maximum number of changepoints to search for using the "BinSeg" method
  
  # Store the optimal rho for specific Year and Quarter data
  combination$rho[i] = temp_slope2$rho[cpts(temp_opt_cpt)]
  
}

Check whether the results are reproducible.

mean(combination$rho) # 0.8047179
[1] 0.8047179

5 Create Networks

Create network for each combination of Year and Quarter.

network_data = list()
relation_data = list()
# plots = list()

for(i in 1:nrow(combination)){
  
  # # Data for specific Year and Quarter
  temp_data = data |> 
    filter(Year==combination$Year[i],
           Quarter==combination$Quarter[i]) |> 
    select(-Year,-Quarter)
  
  set.seed(1)
  # Normalize variables for specific Year and Quarter data
  temp_preprocessor = recipe(~., data = temp_data) |>
    step_best_normalize(all_numeric_predictors()) |>
    step_normalize(all_numeric_predictors())
  
  temp_scale_data = bake(prep(temp_preprocessor),new_data = temp_data)
  
  # Estimate a graphical lasso for specific Year and Quarter data
  graphical_lasso = glasso(s = cov(temp_scale_data), rho = combination$rho[i])
  
  # Extract estimated inverse covariance matrix from graphical lasso
  inv_sigma = graphical_lasso$wi
  diag(inv_sigma) = 0
  rownames(inv_sigma) = colnames(temp_scale_data)
  colnames(inv_sigma) = colnames(temp_scale_data)
  
  # Generate a network using estimated inverse covariance matrix
  network_data[[i]] = as.network(x = inv_sigma, # binary matrix for relationship
                                 directed = FALSE,
                                 vertices = colnames(inv_sigma), # name of vertices
                                 matrix.type = 'adjacency') |> 
    # set vertex attribute "num_relationship" for number of relationship
    set.vertex.attribute(attrname = 'num_relationship',
                         value = colSums(inv_sigma!=0)) |> 
    # set edge attribute "degree_relationship" for relationship strength
    set.edge.value(attrname = 'degree_relationship',
                   value = inv_sigma) |>
    # set network attribute "degree_relationship" for relationship strength
    set.network.attribute(attrname = 'degree_relationship',
                          value = inv_sigma)
  
  relation_data[[i]] = inv_sigma
  
}

names(network_data) = str_c(combination$Year,
                            combination$Quarter,
                            sep = '-')

6 tergm

Estimate a TERGM by MPLE with temporal bootstrapping using ndependent covariates as following.

set.seed(1)
tergmFit1 = btergm(formula = network_data ~ edges +
                     triangle + kstar(k = 5) +
                     transitiveties +
                     edgecov('degree_relationship') +
                     nodematch('num_relationship') +
                     nodecov('num_relationship') +
                     absdiff('num_relationship') +
                     gwdegree(decay = 0.5,fixed = TRUE) +
                     gwesp(decay = 0.5, fixed = TRUE) +
                     memory(type = 'stability') +
                     timecov(transform = function(t) t) +
                     timecov(transform = function(t) t^2),
                   R = 10000, # Number of bootstrap replications
                   parallel = 'multicore',
                   ncpus = parallel::detectCores()-1,
                   usefastglm = TRUE,
                   verbose = FALSE)

summary(tergmFit1)
                               Estimate   Boot mean        2.5%       97.5%
edges                       -8.6539e+00 -1.2565e+01    -12.3793     -7.5844
triangle                    -6.5887e-02  7.4380e-02     -0.2996      0.0942
kstar5                      -1.2302e-03 -6.4653e-03     -0.0039     -0.0004
transitiveties              -4.6934e-01 -6.0191e-01     -0.9528      1.1573
edgecov.degree_relationship -6.9295e+02 -2.9240e+04 -18103.3155   -342.7819
nodematch.num_relationship  -5.7586e-01 -3.3125e+00     -1.3317      0.0902
nodecov.num_relationship     3.0900e-01  3.7542e-01      0.2453      0.4849
absdiff.num_relationship    -1.1937e-01 -1.0292e-01     -0.2260     -0.0377
gwdeg.fixed.0.5              2.7852e+00  2.9804e+00      1.8024      3.9909
gwesp.fixed.0.5              2.0609e+00  1.6242e+00      0.6964      2.6881
edgecov.memory[[i]]          2.2013e-01 -1.6489e+00      0.0389      0.3673
edgecov.timecov1[[i]]       -3.9973e-02  4.3376e-02     -0.1399      0.0642
edgecov.timecov2[[i]]        1.1145e-03 -6.2658e-04     -0.0013      0.0037

Continuously remove some non-significant independent covariates from the model until all independent covariates are statistically significant.

set.seed(1)
tergmFit2 = btergm(formula = network_data ~ edges +
                     kstar(k = 5) +
                     edgecov('degree_relationship') +
                     nodecov('num_relationship') +
                     absdiff('num_relationship') +
                     gwdegree(decay = 0.5,fixed = TRUE) +
                     gwesp(decay = 0.5, fixed = TRUE) +
                     memory(type = 'stability'),
                   R = 10000, # Number of bootstrap replications
                   parallel = 'multicore',
                   ncpus = parallel::detectCores()-1,
                   usefastglm = TRUE,
                   verbose = FALSE)

summary(tergmFit2)
                               Estimate   Boot mean        2.5%       97.5%
edges                       -8.6537e+00 -1.1854e+01    -11.3612     -7.7807
kstar5                      -1.3649e-03 -6.2609e-03     -0.0039     -0.0006
edgecov.degree_relationship -6.8587e+02 -3.9589e+04 -17935.9694   -345.9128
nodecov.num_relationship     2.8482e-01  3.3775e-01      0.2003      0.4194
absdiff.num_relationship    -7.1573e-02 -3.9644e-02     -0.1294     -0.0140
gwdeg.fixed.0.5              2.4119e+00  2.5264e+00      1.3556      3.3449
gwesp.fixed.0.5              1.6416e+00  1.6463e+00      1.2332      2.0369
edgecov.memory[[i]]          2.1827e-01 -2.0118e+00      0.0281      0.3610

7 Goodness of Fit

Assess goodness of fit of the two btergm models using dyad-wise shared partner distribution (dsp), edge-wise shared partner distribution (esp), degree distribution (deg), geodesic distance distribution (geodesic).

The model is said to fit better the closer the medians of the box plots (based on the simulated networks) come to the line that plots the actual value of these statistics in the observed network.

set.seed(1)
gof1 = gof(tergmFit1,
           nsim = 100, # number of networks to be simulated at each time step
           parallel = 'multicore',
           ncpus = parallel::detectCores()-1,
           statistics = c(dsp, esp, deg, geodesic),
           verbose = FALSE)

plot(gof1)

set.seed(1)
gof2 = gof(tergmFit2,
           nsim = 100,
           parallel = 'multicore',
           ncpus = parallel::detectCores()-1,
           statistics = c(dsp, esp, deg, geodesic),
           verbose = FALSE)

plot(gof2)

8 Validation

First, set \(t\) to be 31~39. For fixed \(w\), fit tergm using the network data of [\(t-w\),\(t-1\)] periods, and predict the network of the \(t\) period, and then calculate the average out-of-sample metrics using the real network and the predicted network of the \(t\) period across all \(t\) values, where \(w\) is the window size of training network data. Repeat this step for different \(w\) (5~30) and get corresponding average out-of-sample metrics.

window_size = tibble(w = 5:30, 
                     accuracy = NA,
                     recall = NA,
                     precision = NA,
                     f1_score = NA)
for(j in 1:nrow(window_size)){
  accuracy = c()
  recall = c()
  precision = c()
  f1_score = c()
  for(i in 31:39){
    set.seed(1)
    temp_tergmFit = btergm(formula = network_data[(i-window_size$w[j]):(i-1)] ~ edges +
                             kstar(k = 5) +
                             edgecov('degree_relationship') +
                             nodecov('num_relationship') +
                             absdiff('num_relationship') +
                             gwdegree(decay = 0.5,fixed = TRUE) +
                             gwesp(decay = 0.5, fixed = TRUE) +
                             memory(type = 'stability'),
                           R = 10000, # Number of bootstrap replications
                           parallel = 'multicore',
                           ncpus = parallel::detectCores()-1,
                           usefastglm = TRUE,
                           verbose = FALSE)
    
    # Predict the network of t period
    temp_prediction = simulate(object = temp_tergmFit,
                               nsim = 10000, # number of network to simulate
                               seed = 1)
    
    # Transform predicted networks to be matrice
    pre_network = map(.x = temp_prediction,
                      .f = as.matrix)
    
    # Real network of t period
    real_network = as.matrix(network_data[[i]])
    
    # Calculate accuracy, precision, recall, F1 score for the presence of an edge in the network of t period
    metrics = map(.x = pre_network,
                  .f = function(.x){
                    
                    # Upper Triangular Part of Real Network
                    upper_tri_mat_real = real_network[upper.tri(real_network)]
                    
                    # Upper Triangular Part of Predicted Network
                    upper_tri_mat_pre = .x[upper.tri(.x)]
                    
                    # Calculate metrics
                    accuracy = mean(upper_tri_mat_real==upper_tri_mat_pre)
                    recall = sum(upper_tri_mat_real==1 & upper_tri_mat_pre==1)/sum(upper_tri_mat_real==1)
                    precision = sum(upper_tri_mat_real==1 & upper_tri_mat_pre==1)/sum(upper_tri_mat_pre==1)
                    f_score = 2*precision*recall/(precision+recall)
                    
                    return(tibble(accuracy = accuracy,
                                  recall = recall,
                                  precision = precision,
                                  f_score = f_score))
                  })
    
    # Average metrics
    avg_metric = metrics |>
      list_rbind() |> 
      summarise_all(.funs = mean)
    
    accuracy = c(accuracy,avg_metric$accuracy)
    recall = c(recall,avg_metric$recall)
    precision = c(precision,avg_metric$precision)
    f1_score = c(f1_score,avg_metric$f_score)
  }
  
  window_size$accuracy[j] = mean(accuracy)
  window_size$recall[j] = mean(recall)
  window_size$precision[j] = mean(precision)
  window_size$f1_score[j] = mean(f1_score)
}
window_size %>% 
  pivot_longer(cols = -w) %>% 
  ggplot(aes(x = w, y = value)) +
  geom_point() +
  geom_line() +
  facet_wrap(~name,scales = 'free') +
  labs(x = 'Window size of training network data',
       y = 'Average out-of-sample metric values')

According to the average out-of-sample F1 score, the more training networks used to build tergm, the higher the F1 score of the model to predict whether there is a relationship between two currencies. Therefore, I believe that when predicting the network of \(t\)period, we should use all \(t-1\)periods networks to model to ensure a high F1 score.

Fit tergm using the network data of previous \(t-1\) periods, and predict the network of the \(t\) period, calculate accuracy, precision, recall, F1 score using the real network and the predicted network of the \(t\) period. Here, we set \(t\) to be 31~39.

for(i in 31:39){
  set.seed(1)
  temp_tergmFit = btergm(formula = network_data[1:(i-1)] ~ edges +
                           kstar(k = 5) +
                           edgecov('degree_relationship') +
                           nodecov('num_relationship') +
                           absdiff('num_relationship') +
                           gwdegree(decay = 0.5,fixed = TRUE) +
                           gwesp(decay = 0.5, fixed = TRUE) +
                           memory(type = 'stability'),
                         R = 10000, # Number of bootstrap replications
                         parallel = 'multicore',
                         ncpus = parallel::detectCores()-1,
                         usefastglm = TRUE,
                         verbose = FALSE)
  
  # Predict the network of t period
  temp_prediction = simulate(object = temp_tergmFit,
                             nsim = 10000, # number of network to simulate
                             seed = 1)
  
  # Transform predicted networks to be matrice
  pre_network = map(.x = temp_prediction,
                    .f = as.matrix)
  
  # Real network of t period
  real_network = as.matrix(network_data[[i]])
  
  # Calculate accuracy, precision, recall, F1 score for the presence of an edge in the network of t period
  
  metrics = map(.x = pre_network,
                .f = function(.x){
                  
                  # Upper Triangular Part of Real Network
                  upper_tri_mat_real = real_network[upper.tri(real_network)]
                  
                  # Upper Triangular Part of Predicted Network
                  upper_tri_mat_pre = .x[upper.tri(.x)]
                  
                  # Calculate metrics
                  accuracy = mean(upper_tri_mat_real==upper_tri_mat_pre)
                  recall = sum(upper_tri_mat_real==1 & upper_tri_mat_pre==1)/sum(upper_tri_mat_real==1)
                  precision = sum(upper_tri_mat_real==1 & upper_tri_mat_pre==1)/sum(upper_tri_mat_pre==1)
                  f_score = 2*precision*recall/(precision+recall)
                  
                  return(tibble(accuracy = accuracy,
                                recall = recall,
                                precision = precision,
                                f_score = f_score))
              })
  
  # Average metrics
  avg_metric = metrics |>
    list_rbind() |> 
    summarise_all(.funs = mean)
  
  # Print metrics
  cat(str_c('Accuracy for the ',i,'-th period: ',round(avg_metric$accuracy,3),'\n'))
  cat(str_c('Precision for the ',i,'-th period: ',round(avg_metric$precision,3),'\n'))
  cat(str_c('Recall for the ',i,'-th period: ',round(avg_metric$recall,3),'\n'))
  cat(str_c('F1 score for the ',i,'-th period: ',round(avg_metric$f_score,3),'\n'))
  
}
Accuracy for the 31-th period: 0.791
Precision for the 31-th period: 0.46
Recall for the 31-th period: 0.361
F1 score for the 31-th period: 0.404
Accuracy for the 32-th period: 0.781
Precision for the 32-th period: 0.278
Recall for the 32-th period: 0.308
F1 score for the 32-th period: 0.292
Accuracy for the 33-th period: 0.816
Precision for the 33-th period: 0.763
Recall for the 33-th period: 0.449
F1 score for the 33-th period: 0.565
Accuracy for the 34-th period: 0.759
Precision for the 34-th period: 0.368
Recall for the 34-th period: 0.579
F1 score for the 34-th period: 0.449
Accuracy for the 35-th period: 0.822
Precision for the 35-th period: 0.409
Recall for the 35-th period: 0.484
F1 score for the 35-th period: 0.443
Accuracy for the 36-th period: 0.742
Precision for the 36-th period: 0.414
Recall for the 36-th period: 0.25
F1 score for the 36-th period: 0.311
Accuracy for the 37-th period: 0.727
Precision for the 37-th period: 0.37
Recall for the 37-th period: 0.428
F1 score for the 37-th period: 0.397
Accuracy for the 38-th period: 0.787
Precision for the 38-th period: 0.372
Recall for the 38-th period: 0.52
F1 score for the 38-th period: 0.434
Accuracy for the 39-th period: 0.85
Precision for the 39-th period: 0.226
Recall for the 39-th period: 0.328
F1 score for the 39-th period: 0.267

9 Prediction

Forecast the next network using the second btergm model fitted using all network data.

prediction = simulate(object = tergmFit2,
                      nsim = 10000, # number of response vectors to simulate
                      seed = 1)

Transform the prediction result into matrix.

prediction_matrix = map(.x = prediction,
                        .f = as.matrix.network)

Calculate proportion of relationship.

final_prediction_matrix = reduce(.x = prediction_matrix,
                                 .f = `+`)/length(prediction)
final_prediction = ifelse(final_prediction_matrix<0.01,0,final_prediction_matrix)

10 Visualization

Define function for plotting network.

plot_history_network = function(adjacency_matrix, title, layout = layout_with_fr, size_param = 10){
  library(igraph)
  adjacency_matrix[lower.tri(adjacency_matrix)] = t(adjacency_matrix)[lower.tri(adjacency_matrix)]
  graph = graph.adjacency(abs(adjacency_matrix),
                          mode = "undirected",
                          weighted = TRUE)
  V(graph)$name = colnames(adjacency_matrix)
  V(graph)$label = V(graph)$name
  deg = degree(graph, mode = "all")
  V(graph)$size = log(deg + 2) * size_param
  E(graph)$width = E(graph)$weight * 80
  
  set.seed(1)
  result = plot(graph,
                layout = layout,
                vertex.color = "lightblue",
                edge.label.cex = 0.8,
                main = title)
  return(result)
}
# Historical networks
map2(.x = relation_data,
     .y = names(network_data),
     .f = ~plot_history_network(adjacency_matrix = .x,title = .y))

[[1]]
NULL

[[2]]
NULL

[[3]]
NULL

[[4]]
NULL

[[5]]
NULL

[[6]]
NULL

[[7]]
NULL

[[8]]
NULL

[[9]]
NULL

[[10]]
NULL

[[11]]
NULL

[[12]]
NULL

[[13]]
NULL

[[14]]
NULL

[[15]]
NULL

[[16]]
NULL

[[17]]
NULL

[[18]]
NULL

[[19]]
NULL

[[20]]
NULL

[[21]]
NULL

[[22]]
NULL

[[23]]
NULL

[[24]]
NULL

[[25]]
NULL

[[26]]
NULL

[[27]]
NULL

[[28]]
NULL

[[29]]
NULL

[[30]]
NULL

[[31]]
NULL

[[32]]
NULL

[[33]]
NULL

[[34]]
NULL

[[35]]
NULL

[[36]]
NULL

[[37]]
NULL

[[38]]
NULL

[[39]]
NULL
# 2015-2
plot_history_network(adjacency_matrix = relation_data[[which(names(network_data)=='2015-2')]],
                     title = '2015-2',
                     layout = layout_with_gem,
                     size_param = 5)

NULL
# Function for visualize predicted network
plot_future_network = function(adjacency_matrix, 
                               title, 
                               size_param = 10, 
                               width_param = 80){
  library(igraph)
  adjacency_matrix[lower.tri(adjacency_matrix)] = t(adjacency_matrix)[lower.tri(adjacency_matrix)]
  graph = graph.adjacency(abs(adjacency_matrix),
                          mode = "undirected",
                          weighted = TRUE)
  V(graph)$name = colnames(adjacency_matrix)
  V(graph)$label = V(graph)$name
  deg = degree(graph, mode = "all")
  V(graph)$size = log(deg + 2) * size_param
  E(graph)$width = E(graph)$weight * width_param
  
  set.seed(1)
  result = plot(graph,
                layout = layout_with_fr,
                vertex.color = "lightblue",
                edge.label.cex = 0.8,
                main = title)
  return(result)
}
# Visualize predicted network
plot_future_network(adjacency_matrix = final_prediction,
                    title = '2024-2',
                    size_param = 8,
                    width_param = 5)

NULL
# Visualize predicted network for edges=1
new_final_prediction = ifelse(final_prediction==1,1,0)
plot_future_network(adjacency_matrix = new_final_prediction,
                    title = '2024-2',
                    size_param = 11,
                    width_param = 5)

NULL